From 1a86c2ca603aad0fe6bf9c936e0e7d3416cf539c Mon Sep 17 00:00:00 2001 From: "kaf24@freefall.cl.cam.ac.uk" Date: Wed, 8 Sep 2004 17:36:26 +0000 Subject: [PATCH] bitkeeper revision 1.1159.74.1 (413f431aIw1lNSoAl0H63fnzXoSZ8Q) Fix TLB flushing on page-type changes. In particular, page-table pages must trigger a flush when their type changes. To simplify the code we also flush for LDT/GDT pages, but they will change type very infrequently. --- xen/arch/x86/domain.c | 1 - xen/arch/x86/memory.c | 159 +++++++++++++++++++++++++++++++++++---- xen/include/asm-x86/mm.h | 152 ++++--------------------------------- 3 files changed, 159 insertions(+), 153 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 8bbe2968b2..f16597ad73 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -733,7 +733,6 @@ int construct_dom0(struct domain *p, *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); page = &frame_table[mfn]; - set_bit(_PGC_tlb_flush_on_type_change, &page->count_info); if ( !get_page_and_type(page, p, PGT_writable_page) ) BUG(); diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index 8ca0533973..a41b76065f 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -462,7 +462,6 @@ get_page_from_l1e( { if ( unlikely(!get_page_type(page, PGT_writable_page)) ) return 0; - set_bit(_PGC_tlb_flush_on_type_change, &page->count_info); } return 1; @@ -774,18 +773,6 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) int alloc_page_type(struct pfn_info *page, unsigned int type) { - if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, - &page->count_info)) ) - { - struct domain *p = page->u.inuse.domain; - if ( unlikely(NEED_FLUSH(tlbflush_time[p->processor], - page->tlbflush_timestamp)) ) - { - perfc_incr(need_flush_tlb_flush); - flush_tlb_cpu(p->processor); - } - } - switch ( type ) { case PGT_l1_page_table: @@ -833,6 +820,151 @@ void free_page_type(struct pfn_info *page, unsigned int type) } +void put_page_type(struct pfn_info *page) +{ + u32 nx, x, y = page->u.inuse.type_info; + + again: + do { + x = y; + nx = x - 1; + + ASSERT((x & PGT_count_mask) != 0); + ASSERT(x & PGT_validated); + + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + /* Record TLB information for flush later. Races are harmless. */ + page->tlbflush_timestamp = tlbflush_clock; + + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) ) + { + /* + * Page-table pages must be unvalidated when count is zero. The + * 'free' is safe because the refcnt is non-zero and validated + * bit is clear => other ops will spin or fail. + */ + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, + x & ~PGT_validated)) != x) ) + goto again; + /* We cleared the 'valid bit' so we do the clear up. */ + free_page_type(page, x & PGT_type_mask); + /* Carry on, but with the 'valid bit' now clear. */ + x &= ~PGT_validated; + nx &= ~PGT_validated; + } + } + else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == + (PGT_pinned | 1)) ) + { + /* Page is now only pinned. Make the back pointer mutable again. */ + nx |= PGT_va_mutable; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); +} + + +int get_page_type(struct pfn_info *page, u32 type) +{ + u32 nx, x, y = page->u.inuse.type_info; + + again: + do { + x = y; + nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); + return 0; + } + else if ( unlikely((x & PGT_count_mask) == 0) ) + { + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) + { + /* + * On type change we check to flush stale TLB entries. This + * may be unnecessary (e.g., page was GDT/LDT) but those + * circumstances should be very rare. + */ + struct domain *d = page->u.inuse.domain; + if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor], + page->tlbflush_timestamp)) ) + { + perfc_incr(need_flush_tlb_flush); + flush_tlb_cpu(d->processor); + } + + /* We lose existing type, back pointer, and validity. */ + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); + nx |= type; + + /* No special validation needed for writable pages. */ + /* Page tables and GDT/LDT need to be scanned for validity. */ + if ( type == PGT_writable_page ) + nx |= PGT_validated; + } + } + else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) + { + if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) + { + if ( ((x & PGT_type_mask) != PGT_l2_page_table) || + ((type & PGT_type_mask) != PGT_l1_page_table) ) + MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( (x & PGT_va_mask) == PGT_va_mutable ) + { + /* The va backpointer is mutable, hence we update it. */ + nx &= ~PGT_va_mask; + nx |= type; /* we know the actual type is correct */ + } + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) + { + /* The va backpointer wasn't mutable, and is different. */ + MEM_LOG("Unexpected va backpointer (saw %08x != exp %08x)" + " for pfn %08lx\n", x, type, page_to_pfn(page)); + return 0; + } + } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) == x ) + { + rep_nop(); + barrier(); + } + goto again; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + + if ( unlikely(!(nx & PGT_validated)) ) + { + /* Try to validate page type; drop the new reference on failure. */ + if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) + { + MEM_LOG("Error while validating pfn %08lx for type %08x." + " caf=%08x taf=%08x\n", + page_to_pfn(page), type, + page->count_info, + page->u.inuse.type_info); + /* Noone else can get a reference. We hold the only ref. */ + page->u.inuse.type_info = 0; + return 0; + } + + /* Noone else is updating simultaneously. */ + __set_bit(_PGT_validated, &page->u.inuse.type_info); + } + + return 1; +} + + static int do_extended_command(unsigned long ptr, unsigned long val) { int okay = 1, cpu = smp_processor_id(); @@ -1747,7 +1879,6 @@ int ptwr_do_page_fault(unsigned long addr) #ifndef NDEBUG void ptwr_status(void) { - int i; unsigned long pte, pfn; struct pfn_info *page; l2_pgentry_t *pl2e; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 18e70240d1..50d5988329 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -81,17 +81,14 @@ struct pfn_info /* 17-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1<<17)-1) - /* For safety, force a TLB flush when this page's type changes. */ -#define _PGC_tlb_flush_on_type_change 31 -#define PGC_tlb_flush_on_type_change (1<<_PGC_tlb_flush_on_type_change) /* Cleared when the owning guest 'frees' this page. */ -#define _PGC_allocated 30 +#define _PGC_allocated 31 #define PGC_allocated (1<<_PGC_allocated) /* This bit is always set, guaranteeing that the count word is never zero. */ -#define _PGC_always_set 29 +#define _PGC_always_set 30 #define PGC_always_set (1<<_PGC_always_set) - /* 29-bit count of references to this frame. */ -#define PGC_count_mask ((1<<29)-1) + /* 30-bit count of references to this frame. */ +#define PGC_count_mask ((1<<30)-1) /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) @@ -104,7 +101,7 @@ struct pfn_info do { \ (_pfn)->u.inuse.domain = (_dom); \ /* The incremented type count is intended to pin to 'writable'. */ \ - (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1; \ + (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1; \ wmb(); /* install valid domain ptr before updating refcnt. */ \ spin_lock(&(_dom)->page_alloc_lock); \ /* _dom holds an allocation reference */ \ @@ -143,155 +140,34 @@ static inline int get_page(struct pfn_info *page, struct domain *domain) { u32 x, nx, y = page->count_info; - struct domain *p, *np = page->u.inuse.domain; + struct domain *d, *nd = page->u.inuse.domain; do { x = y; nx = x + 1; - p = np; + d = nd; if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ - unlikely(p != domain) ) /* Wrong owner? */ + unlikely(d != domain) ) /* Wrong owner? */ { DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n", - page_to_pfn(page), domain, p, + page_to_pfn(page), domain, d, x, page->u.inuse.type_info); return 0; } __asm__ __volatile__( LOCK_PREFIX "cmpxchg8b %3" - : "=d" (np), "=a" (y), "=c" (p), + : "=d" (nd), "=a" (y), "=c" (d), "=m" (*(volatile u64 *)(&page->count_info)) - : "0" (p), "1" (x), "c" (p), "b" (nx) ); - } - while ( unlikely(np != p) || unlikely(y != x) ); - - return 1; -} - - -static inline void put_page_type(struct pfn_info *page) -{ - u32 nx, x, y = page->u.inuse.type_info; - - again: - do { - x = y; - nx = x - 1; - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - page->tlbflush_timestamp = tlbflush_clock; - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && - likely(nx & PGT_validated) ) - { - /* - * Page-table pages must be unvalidated when count is zero. The - * 'free' is safe because the refcnt is non-zero and the - * validated bit is clear => other ops will spin or fail. - */ - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, - x & ~PGT_validated)) != x) ) - goto again; - /* We cleared the 'valid bit' so we must do the clear up. */ - free_page_type(page, x & PGT_type_mask); - /* Carry on as we were, but with the 'valid bit' now clear. */ - x &= ~PGT_validated; - nx &= ~PGT_validated; - } - } - else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == - (PGT_pinned | 1)) ) - { - /* Page is now only pinned. Make the back pointer mutable again. */ - nx |= PGT_va_mutable; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); -} - - -static inline int get_page_type(struct pfn_info *page, u32 type) -{ - u32 nx, x, y = page->u.inuse.type_info; - again: - do { - x = y; - nx = x + 1; - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); - return 0; - } - else if ( unlikely((x & PGT_count_mask) == 0) ) - { - if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) - { - nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); - nx |= type; - /* No extra validation needed for writable pages. */ - if ( type == PGT_writable_page ) - nx |= PGT_validated; - } - } - else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) - { - if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) - { -#ifdef VERBOSE - if ( ((x & PGT_type_mask) != PGT_l2_page_table) || - ((type & PGT_type_mask) != PGT_l1_page_table) ) - DPRINTK("Bad type (saw %08x != exp %08x) for pfn %08lx\n", - x & PGT_type_mask, type, page_to_pfn(page)); -#endif - return 0; - } - else if ( (x & PGT_va_mask) == PGT_va_mutable ) - { - /* The va backpointer is mutable, hence we update it. */ - nx &= ~PGT_va_mask; - nx |= type; /* we know the actual type is correct */ - } - else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) - { - /* The va backpointer wasn't mutable, and is different. */ - DPRINTK("Unexpected va backpointer (saw %08x != exp %08x)" - " for pfn %08lx\n", x, type, page_to_pfn(page)); - return 0; - } - } - else if ( unlikely(!(x & PGT_validated)) ) - { - /* Someone else is updating validation of this page. Wait... */ - while ( (y = page->u.inuse.type_info) == x ) - { - rep_nop(); - barrier(); - } - goto again; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); - - if ( unlikely(!(nx & PGT_validated)) ) - { - /* Try to validate page type; drop the new reference on failure. */ - if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) - { - DPRINTK("Error while validating pfn %08lx for type %08x." - " caf=%08x taf=%08x\n", - page_to_pfn(page), type, - page->count_info, - page->u.inuse.type_info); - put_page_type(page); - return 0; - } - - set_bit(_PGT_validated, &page->u.inuse.type_info); + : "0" (d), "1" (x), "c" (d), "b" (nx) ); } + while ( unlikely(nd != d) || unlikely(y != x) ); return 1; } +void put_page_type(struct pfn_info *page); +int get_page_type(struct pfn_info *page, u32 type); static inline void put_page_and_type(struct pfn_info *page) { -- 2.30.2